********************************************************************************
*Clean HCRIS data
*Run on Elevance server

*FTE variable generation steps based on replication code from "Employer Consolidation 
*and Wages: Evidence from Hospitals" by Prager and Schmidt (2021), available here: https://www.aeaweb.org/articles?id=10.1257/aer.20190690
********************************************************************************
* Root folder of the public HCRIS replication data; the relative file names below resolve from its output subfolder
global SOURCE_BASE "\replication\public_data\hcris_data"
cd "${SOURCE_BASE}\output"

********************************************************************************
*1. Generate vars
	* Load the merged hospital-year file and turn the provider number into a numeric variable
	* (force: values that cannot be read as numbers become missing instead of stopping the script)
	use "hcris_merged_hospyear.dta", clear
	destring pn, force replace
	* Report, without dropping anything, how often each pn-ye combination occurs
	gduplicates report pn ye
	rename pn num_prvdr_num
	* Best-effort drop: capture lets the script continue if the drop errors, noisily still prints the message
	capture noisily drop nreports nfmt* frac_year covg_*
	
	*LABOR VARS
	{
	* Hour variables to convert into FTEs, grouped the way the totals below use them:
	*   contracthrs   - contract hours
	*   contractohhrs - contract hours within overhead
	*   ohhrs         - remaining overhead hours
	local contracthrs s32_contr_mgmt_hrsoth s32_contr_physadm_hrsoth s32_contr_ptntcare_hrsoth s32_contr_interns_hrs
	local contractohhrs s32_oh_dietcontr_hrs s32_oh_gacontr_hrs s32_oh_housecontr_hrs
	local ohhrs s32_oh_cafe_hrs s32_oh_diet_hrs s32_oh_ga_hrs s32_oh_house_hrs s32_oh_laundry_hrs s32_oh_maint_hrs s32_oh_maintperson_hrs s32_oh_nursadm_hrs s32_oh_ops_hrs s32_oh_othga_hrs s32_oh_pharmacy_hrs s32_oh_records_hrs s32_oh_ss_hrs s32_oh_supply_hrs s32_oh_benefits_hrs

	*restrict to relevant timeframe: every s32 variable is set to missing for 2011
	foreach var of varlist s32* {
		replace `var' = . if ye==2011
	}

	*drop vars if < 5% populated
	* The cutoff is a fixed count of 2000 nonmissing values.
	* NOTE(review): presumably 2000 is about 5 percent of the rows in this file - confirm
	* otherhrs collects the overhead hour vars that pass the cutoff
	local otherhrs
	foreach var of local ohhrs {
		count if !missing(`var')
		if r(N) < 2000 {
			di "`var'"
			drop `var'
		}
		else {
			local otherhrs `otherhrs' `var'
		}
	}

	*gen FTE vars: hours scaled to a 365-day period, then divided by 52 weeks and 40 hours
	* days is fixed at 365 here, so the period rescaling is a no-op apart from rounding
	local days = 365
	foreach var in `contracthrs' `contractohhrs' `otherhrs' {
		gen fte`var' = ((365*`var'/`days')/52)/40
		drop `var'
	}

	* FTE counterparts of the overhead hour vars that passed the cutoff (contract overhead not included)
	local ohfte
	foreach var of local otherhrs {
		local ohfte `ohfte' fte`var'
	}

	*total contract FTES (excludes overhead contract FTES)
	* NOTE(review): ftes32_contr_interns_hrs is generated above but is not part of this total - confirm intended
	gegen totcontractfte = rowtotal(ftes32_contr_mgmt_hrsoth ftes32_contr_physadm_hrsoth ftes32_contr_ptntcare_hrsoth)
	gegen totcontractohfte = rowtotal(ftes32_oh_dietcontr_hrs ftes32_oh_gacontr_hrs ftes32_oh_housecontr_hrs)

	*total overhead FTES (includes overhead contract FTES, counted exactly once)
	* The overhead FTEs are listed explicitly on purpose: the wildcard ftes32_oh_*_hrs also matches
	* the three contract overhead FTE vars, so adding totcontractohfte to a wildcard sum would
	* count contract overhead twice.
	gegen totohfte = rowtotal(`ohfte' totcontractohfte)

	* rowtotal returns 0, not missing, when all inputs are missing, so 2011 has to be blanked again
	foreach var in totcontractfte totcontractohfte totohfte {
		replace `var' = . if ye==2011
	}
	
	* Keep the identifiers, the FTE totals and the g3 variables; all other columns are dropped here
	keep num ye tot* g3*
	local all totcontractfte totcontractohfte totohfte

	*drop negative FTEs (Prager/Schmidt)
	foreach var of local all {
		replace `var' = . if `var' < 0
	}

	*Trim 5% outliers (by year)
	* Within each year 2012-2018, values outside the 5th-95th percentile band are set to missing
	foreach var of local all {
		forval i = 2012/2018 {
			qui sum `var' if ye==`i', d
			replace `var' = . if !inrange(`var',r(p5),r(p95)) & ye==`i'
		}
	}

	*Trim outliers from hospital median
	* pctdif is the relative deviation from the provider median; values more than 90 percent below
	* or more than 800 percent above it are set to missing (a missing pctdif also fails inrange)
	foreach var of local all {
		bysort num: gegen med = median(`var')
		gen pctdif = `var' / med - 1
		replace `var' = . if !inrange(pctdif,-.9,8)
		drop pctdif med
	}

	*Impute missing vars
	* Linear interpolation across gaps within a provider. Subscripts that fall outside the
	* by-group evaluate to missing, so nothing is filled across providers or past the panel ends.
	sort num ye
	foreach var of local all {
		* one-observation gap: midpoint of the two neighbours
		by num: replace `var' = (`var'[_n-1] + `var'[_n+1])/2 if missing(`var')
		* first observation of a two-observation gap: one third of the way from previous to two-ahead
		by num: replace `var' = `var'[_n-1] + 1/3*(`var'[_n+2] - `var'[_n-1]) if missing(`var')
		* second observation of a two-observation gap: one third of the way back from next towards
		* two-behind. The correction is subtracted; adding it would extrapolate beyond the next value.
		by num: replace `var' = `var'[_n+1] - 1/3*(`var'[_n+1] - `var'[_n-2]) if missing(`var')
		* middle observation of a three-observation gap: midpoint of two-behind and two-ahead
		by num: replace `var' = (`var'[_n-2] + `var'[_n+2])/2 if missing(`var')
	}
	}
	
	*MARGIN VARS 
	{
	*merge to AHA sumstat sample
	* Only the listed AHA columns are brought in. The aha global is not set in the visible part
	* of this file, so it has to be defined before this script runs.
	local ahavars ofint hospty absorb* hosp_c incl* prev never
	merge 1:1 num ye using "$aha\aha_merge6_1", keepusing(`ahavars')
	* Retain matched hospital-years only
	drop if _m != 3
	drop _m
	* keep = 1 for rows with ofint equal to 1 or hospty equal to gac,
	* unless the row is absorbed, absorbing, or has hosp_c equal to fed
	gen keep = (ofint==1 | hospty=="gac") & !(absorbed==1 | absorbing==1) & hosp_c != "fed"
	
	* Describe the g3 revenue variables within the flagged sample before any cleaning
	summarize g3*rev* if keep, detail
	* Negative revenue is set to missing, so it cannot be used as a margin denominator
	foreach rev in g3_totrev g3_netrev_ptnts {
		replace `rev' = . if `rev' < 0
	}

	* Margins: net income over total revenue, and patient net income over net patient revenue
	generate mgn_tot = g3_netincome / g3_totrev
	generate mgn_ptnt = g3_netincome_ptnts / g3_netrev_ptnts
	summarize mgn* if keep, detail

	*Trim 5% outliers from analysis sample
	* The 5th and 95th percentiles are computed on the flagged sample only,
	* but values outside them are set to missing in every row
	foreach m in mgn_tot mgn_ptnt {
		quietly summarize `m' if keep, detail
		replace `m' = . if !missing(`m') & `m' < r(p5)
		replace `m' = . if !missing(`m') & `m' > r(p95)
	}
	summarize mgn* if keep, detail

	* Final column set, flag label, storage compression and sort order for the saved file
	keep num ye tot* g3* mgn* keep
	label variable keep "in relev sample"
	quietly compress
	sort num ye
	}	
	* Write the cleaned hospital-year data to the current working directory, overwriting any existing copy
	save "hcris_cleaned.dta", replace